First, we need to read in the data and join the training and test data.
# Read the Dreaddit training split (2838 rows x 116 columns per the
# readr column-specification output below).
train_data <- read_csv("reddit_stress_data/dreaddit-train.csv")
## Rows: 2838 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the Dreaddit test split (715 rows x 116 columns per the
# readr column-specification output below).
test_data <- read_csv("reddit_stress_data/dreaddit-test.csv")
## Rows: 715 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack the training and test splits into one data frame.
# BUG FIX: add_row() expects individual name-value pairs (one row),
# not a whole data frame; bind_rows() is the correct verb for
# concatenating two tibbles with the same columns.
reddit_stress_data <- bind_rows(train_data, test_data)
Now we need to find the word distributions. We’ll start by unnesting the tokens and training this on the full dataset.
# Tokenize the combined dataset: one row per (post, word), keeping
# id, label, and subreddit for later grouping.
words_tokenized <- reddit_stress_data %>%
  select(id, text, label, subreddit) %>%
  unnest_tokens(word, text) %>%
  # Strip punctuation left inside tokens (e.g. "don't" -> "dont").
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  # Replace whole-number tokens with the %d% placeholder.
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  # Explicit join key documents intent and silences the
  # "Joining, by = ..." message seen in the knitted output.
  anti_join(stop_words, by = "word")
## Joining, by = "word"
# Same tokenization pipeline, applied to the test split only.
words_tokenized_test <- test_data %>%
  select(id, text, label, subreddit) %>%
  unnest_tokens(word, text) %>%
  # Strip punctuation left inside tokens.
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  # Replace whole-number tokens with the %d% placeholder.
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  # Explicit join key (consistent with the full-data pipeline).
  anti_join(stop_words, by = "word")
## Joining, by = "word"
# Same tokenization pipeline, applied to the training split only.
words_tokenized_train <- train_data %>%
  select(id, text, label, subreddit) %>%
  unnest_tokens(word, text) %>%
  # Strip punctuation left inside tokens.
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  # Replace whole-number tokens with the %d% placeholder.
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  # Explicit join key (consistent with the full-data pipeline).
  anti_join(stop_words, by = "word")
## Joining, by = "word"
# Posts per label (0/1). count(label) replaces group_by() + count()
# and avoids leaving a persistently grouped tibble behind.
label_counts <- reddit_stress_data %>%
  count(label)
# Bar chart of the label balance.
plot_ly(label_counts, x = ~label, y = ~n, type = "bar")
# Posts per subreddit.
subreddit_counts <- reddit_stress_data %>%
  count(subreddit)
# BUG FIX: plotly's argument is `type`, not `kind`; the original
# `kind = "bar"` was rejected (see warning in the knitted output)
# and the trace type had to be guessed.
plot_ly(subreddit_counts, x = ~subreddit, y = ~n, type = "bar")
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#bar
## Warning: 'bar' objects don't have these attributes: 'kind'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'xperiod', 'yperiod', 'xperiod0', 'yperiod0', 'xperiodalignment', 'yperiodalignment', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'textposition', 'insidetextanchor', 'textangle', 'textfont', 'insidetextfont', 'outsidetextfont', 'constraintext', 'cliponaxis', 'orientation', 'base', 'offset', 'width', 'marker', 'offsetgroup', 'alignmentgroup', 'selected', 'unselected', 'r', 't', '_deprecated', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
# Label distribution faceted by label x subreddit.
# FIX: `height` and `width` are not ggplot() arguments (they belong
# to the knitr chunk options / device), so they are removed.
ggplot(reddit_stress_data, aes(y = label)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Labels by Subreddit") +
  facet_grid(label ~ subreddit)
First, to get a better idea of what the overall word distribution is, let’s plot all of the word frequencies.
# Per-word frequencies across the whole corpus, most frequent first.
# count(sort = TRUE) is equivalent to count() + arrange(desc(n)).
my_top_word_counts <- words_tokenized %>%
  count(word, sort = TRUE)

# Boxplot of the frequencies to visualize the heavy right skew.
ggplot(my_top_word_counts, aes(y = n)) +
  geom_boxplot(fill = "steelblue")
What are the top words that appear in the original dataset?
# Display the five most frequent words in the corpus.
head(my_top_word_counts, 5)
## # A tibble: 5 x 2
## word n
## <chr> <int>
## 1 im 2235
## 2 %d% 2210
## 3 dont 1184
## 4 feel 967
## 5 time 967
Although some words appear very frequently, most of the words barely appear compared to the top words. Let’s see what happens if we eliminate words based on whether they appear less than or equal to 5 times.
# Replace words occurring <= 5 times with a rare-word token.
# NOTE(review): ReplaceRareWords() is defined elsewhere in this
# project; the output below suggests it maps rare words to "unk"
# and prefixes kept tokens with "text_" -- confirm against its
# definition. It is fed the raw reddit_stress_data rather than the
# tokenized data -- verify that is intentional.
words_tokenized_rare_words_removed <- ReplaceRareWords(reddit_stress_data, rare_defn = 5)
# Boxplot of word frequencies after removal.
words_tokenized_rare_words_removed %>%
count(word) %>%
arrange(desc(n)) %>%
ggplot(aes(y = n)) + geom_boxplot(fill = "steelblue")
There seem to still be significant outliers in the dataset. I’m going to look at the one significant outlier and see what it is.
# Inspect the five most frequent tokens after rare-word removal;
# count(sort = TRUE) replaces count() + arrange(desc(n)).
words_tokenized_rare_words_removed %>%
  count(word, sort = TRUE) %>%
  head(5)
## # A tibble: 5 x 2
## word n
## <chr> <int>
## 1 unk 14453
## 2 text_im 2235
## 3 text_%d% 2210
## 4 text_dont 1184
## 5 text_feel 967
Defining the cut off for rare words removal to be 5 reduces the number of words somewhat but the data is still heavily right skewed. Let’s try setting the cut off to 15.
# Raise the rare-word cutoff to 15 occurrences.
# NOTE(review): ReplaceRareWords() is defined elsewhere; the output
# below suggests rare words collapse into a single "unk" token --
# confirm. As above, it is fed the raw reddit_stress_data rather
# than the tokenized data -- verify that is intentional.
words_tokenized_rare_words_removed <- ReplaceRareWords(reddit_stress_data, rare_defn = 15)
# Boxplot of word frequencies after the stricter removal.
words_tokenized_rare_words_removed %>%
count(word) %>%
arrange(desc(n)) %>%
ggplot(aes(y = n)) + geom_boxplot(fill = "steelblue")
We have removed a significant number of words but the data is still heavily right skewed. It looks like there is one significant outlier that is skewing the data.
# Top five tokens with the cutoff at 15; the "unk" bucket dominates.
words_tokenized_rare_words_removed %>%
  count(word, sort = TRUE) %>%
  head(5)
## # A tibble: 5 x 2
## word n
## <chr> <int>
## 1 unk 29766
## 2 text_im 2235
## 3 text_%d% 2210
## 4 text_dont 1184
## 5 text_feel 967
# Vocabulary size: one row per distinct word, so nrow() gives the
# same value as length(my_top_word_counts$word).
nrow(my_top_word_counts)
## [1] 12059
Now let’s see the most common words among the data (overall).
# Return the `num` most frequent words in `df` as a data frame with
# columns `word` and `n`, ordered by descending frequency.
GetTopNMostCommonWords <- function(df, num) {
  df %>%
    count(word, sort = TRUE) %>%
    head(num)
}
# Number of top words to display in each chart below.
num <- 20
# NOTE(review): the name says "top_10" but it holds the top 20
# (num = 20) -- consider renaming for clarity.
top_10_full_data <- GetTopNMostCommonWords(words_tokenized, num)
Now I will plot the top 20 most common words in the dataset.
# Bar chart of the 20 most common words, most frequent on the left.
# FIX: title corrected from "Top 10" -- num is 20, so 20 words are shown.
ggplot(top_10_full_data, aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Full Dataset", x = "Word", y = "Frequency")
Now let’s see how this varies by label: stressed or non-stressed.
# BUG FIX: in the Dreaddit dataset label == 1 marks a stressed post
# and label == 0 a non-stressed one (Turcan & McKeown 2019); the
# original assignments were swapped.
stressed_data <- filter(words_tokenized, label == 1)
non_stressed_data <- filter(words_tokenized, label == 0)
Now let’s plot them
# Top words among non-stressed posts.
# FIX: title corrected from "Top 10" -- num is 20.
ggplot(GetTopNMostCommonWords(non_stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Non-Stressed Dataset", x = "Word", y = "Frequency")
Now let’s see the difference among stressed data.
# Top words among stressed posts.
# FIX: title corrected from "Top 10" -- num is 20.
ggplot(GetTopNMostCommonWords(stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Stressed Dataset", x = "Word", y = "Frequency")
# Exploring Differences in Subreddit Data

Now we’re going to examine the differences by subreddit. First, we will see what unique subreddits have been selected. For each subreddit, I want to examine the difference between the labels and the different words among each label.
# List the ten distinct subreddits present in the combined data.
unique(reddit_stress_data$subreddit)
## [1] "ptsd" "assistance" "relationships" "survivorsofabuse"
## [5] "domesticviolence" "anxiety" "homeless" "stress"
## [9] "almosthomeless" "food_pantry"
I’m interested in understanding how the data is distributed among each of these subreddits.
# Bar chart of post counts per subreddit.
ggplot(reddit_stress_data, aes(x = subreddit)) + geom_bar(fill = "steelblue")
Now, let’s check out the ptsd subreddit.
# Tokens from the ptsd subreddit only.
ptsd_data <- filter(words_tokenized, subreddit == "ptsd")
# Label balance within r/ptsd.
ggplot(filter(reddit_stress_data, subreddit == "ptsd"), aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "PTSD Data by Label")
# FIX: title corrected from "Top 10" -- num is 20.
ggplot(GetTopNMostCommonWords(ptsd_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the PTSD Subreddit", x = "Word", y = "Frequency")
## Domestic Violence Subreddit

Now, let’s check out the domestic violence subreddit.
# Tokens from the domesticviolence subreddit only.
domestic_violence_data <- filter(words_tokenized, subreddit == "domesticviolence")
# Label balance within r/domesticviolence.
ggplot(filter(reddit_stress_data, subreddit == "domesticviolence"), aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Domestic Violence Data by Label")
# FIX: title corrected from "Top 10" -- num is 20.
ggplot(GetTopNMostCommonWords(domestic_violence_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Domestic Violence Subreddit", x = "Word", y = "Frequency")
## Almost Homeless Subreddit

Now let’s check out the almost homeless subreddit.
# Tokens from the almosthomeless subreddit only.
almost_homeless_data <- filter(words_tokenized, subreddit == "almosthomeless")
# Label balance within r/almosthomeless.
ggplot(filter(reddit_stress_data, subreddit == "almosthomeless"), aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Almost Homeless Data by Label")
# FIX: title corrected from "Top 10" -- num is 20.
ggplot(GetTopNMostCommonWords(almost_homeless_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Almost Homeless Subreddit", x = "Word", y = "Frequency")
## Assistance Subreddit
# Tokens from the assistance subreddit only.
assistance_subreddit <- filter(words_tokenized, subreddit == "assistance")

# Label balance within r/assistance.
ggplot(filter(reddit_stress_data, subreddit == "assistance"), aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Assistance Data by Label")

# Twenty most common words, most frequent on the left.
top_assistance_words <- GetTopNMostCommonWords(assistance_subreddit, num)
ggplot(top_assistance_words, aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Assistance Subreddit", x = "Word", y = "Frequency")
# Overall sentiment distribution.
# FIX: geom_boxplot() has no `bins` parameter (it was ignored with a
# warning in the knitted output), so it is removed.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment")
## Warning: Ignoring unknown parameters: bins
## By Label
# Sentiment distribution split by label.
# FIX: removed the ignored `bins` parameter (not a boxplot argument).
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ label)
## Warning: Ignoring unknown parameters: bins
## By Subreddit
# Sentiment distribution split by subreddit.
# FIX: removed the ignored `bins` parameter (not a boxplot argument).
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ subreddit)
## Warning: Ignoring unknown parameters: bins
# Sentiment distribution faceted by subreddit x label.
# FIX: removed the ignored `bins` parameter (not a boxplot argument).
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_grid(subreddit ~ label)
## Warning: Ignoring unknown parameters: bins